package belloCollector.util;

import org.jsoup.Jsoup;

import cn.edu.hfut.dmic.webcollector.model.Page;
import cn.edu.hfut.dmic.webcollector.util.CharsetDetector;

/**
 * Best-effort repair for HTML pages whose text appears to have been decoded
 * with the wrong character set.
 */
public class HandlePageCharset {

	/** CSS selector for the element whose text is probed for decoding damage. */
	private static final String TITLE_SELECTOR = "title";

	/**
	 * Re-decodes the page's raw bytes with an alternative charset when the page
	 * is {@code text/html} and its first title element matches the probe in
	 * {@link #titleLooksSuspect(Page)}.
	 * <p>
	 * The charset guessed by {@link CharsetDetector} is swapped for its
	 * counterpart: a guess of gb2312/gbk is re-decoded as utf-8, and a guess of
	 * utf-8 is re-decoded as gbk. Any other guess leaves the HTML untouched. In
	 * every case that reaches this point the document is re-parsed from the
	 * page's current HTML.
	 * <p>
	 * This method never throws: a {@code null} page, a missing response,
	 * content type, title element or content simply results in no change, and
	 * any unexpected failure is reported on standard output.
	 *
	 * @param page the page to inspect and update in place; {@code null} is ignored
	 */
	public void handlePageCharset(Page page) {
		if (page == null) {
			// Nothing to repair; also keeps the catch handler below from dereferencing null.
			return;
		}
		try {
			if (page.getResponse() == null) {
				return;
			}
			String responseContentType = page.getResponse().getContentType();
			if (responseContentType == null || !responseContentType.contains("text/html")) {
				return;
			}
			if (!titleLooksSuspect(page)) {
				return;
			}
			byte[] content = page.getContent();
			if (content == null) {
				return;
			}

			String charset = CharsetDetector.guessEncoding(content);
			// Deliberate swap: the guessed charset produced a suspect title, so try the other one.
			if ("gb2312".equalsIgnoreCase(charset) || "gbk".equalsIgnoreCase(charset)) {
				page.setHtml(new String(content, "utf-8"));
			} else if ("utf-8".equalsIgnoreCase(charset)) {
				page.setHtml(new String(content, "gbk"));
			}
			page.setDoc(Jsoup.parse(page.getHtml(), page.getUrl()));
		} catch (Exception e) {
			// Best-effort step: report and carry on. The exception itself is printed
			// because getMessage() is frequently null (e.g. NullPointerException).
			System.out.println(page.getUrl() + " : change charset error ! " + e);
		}
	}

	/**
	 * Tells whether the page's first title element exists and its text
	 * contains the probe string.
	 * <p>
	 * NOTE(review): the probe literal is a single space exactly as it appears
	 * in this file; it may originally have been the Unicode replacement
	 * character (U+FFFD) and been altered by an encoding conversion - confirm.
	 *
	 * @param page the page to inspect; must not be {@code null}
	 * @return {@code true} only when a title element is present and its text contains the probe
	 */
	private static boolean titleLooksSuspect(Page page) {
		// An empty selection has no first element, so emptiness (not null) is the real guard.
		if (page.select(TITLE_SELECTOR) == null || page.select(TITLE_SELECTOR).isEmpty()) {
			return false;
		}
		return page.select(TITLE_SELECTOR).first().text().contains(" ");
	}
}
